In [117]:
import pandas as pd
import numpy as np
import string
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
products = pd.read_csv('amazon_baby.csv', dtype=[('name',np.str_), ('review', np.str_),('rating', np.float)])

In [3]:
len(products)


Out[3]:
183531

Cleanup dataset


In [15]:
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
def remove_punctuation(text):
    if type(text) == str and len(text.strip())!=0:
        return text.translate(remove_punct_map)
    else:
        return ''

In [16]:
#products['review_clean'] = products[pd.notnull(products['review'])]['review'].apply(remove_punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)

In [17]:
len(products)


Out[17]:
166752

In [18]:
products = products[products['rating'] != 3]

In [19]:
len(products)


Out[19]:
166752

In [20]:
products['sentiment'] = products['rating'].apply(lambda rating: 1 if rating > 3 else -1)

In [21]:
len(products[pd.isnull(products['review'])])


Out[21]:
777

Split data into train and test set


In [22]:
train_idx, test_idx = json.load(open('module-2-assignment-train-idx.json')), json.load(open('module-2-assignment-test-idx.json'))

In [23]:
train_data, test_data = products.iloc[train_idx], products.iloc[test_idx]
print(len(train_data))
#train_data, test_data = train_data[pd.notnull(train_data['review_clean'])], test_data[pd.notnull(test_data['review_clean'])]
print(len(train_data))
print(len(train_data[pd.isnull(train_data['review_clean'])]))


133416
133416
0

In [24]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])

In [25]:
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])


Out[25]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
print(len(train_data['sentiment']))
print(sum(train_data['sentiment'].values))


133416
90912

In [27]:
print('Total number of coeeficients: %s' % len(sentiment_model.coef_[0]))
print('Number of positive coefficients: %s' %sum(sum(sentiment_model.coef_>=0)))


Total number of coeeficients: 121712
Number of positive coefficients: 86007

Making predictions with Logistic Regression


In [28]:
sample_test_data = test_data[10:13]

In [29]:
sample_test_data


Out[29]:
name review rating review_clean sentiment
59 Our Baby Girl Memory Book Absolutely love it and all of the Scripture in... 5 Absolutely love it and all of the Scripture in... 1
71 Wall Decor Removable Decal Sticker - Colorful ... Would not purchase again or recommend. The dec... 2 Would not purchase again or recommend The deca... -1
91 New Style Trailing Cherry Blossom Tree Decal R... Was so excited to get this product for my baby... 1 Was so excited to get this product for my baby... -1

In [30]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)


[  5.60373533  -3.17167935 -10.41665959]

In [32]:
sample_test_data['predictions_proba'] = sentiment_model.predict_proba(sample_test_matrix)[:,1]


C:\Users\Viktor_Pishchulin\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [34]:
sample_test_data


Out[34]:
name review rating review_clean sentiment predictions_proba
59 Our Baby Girl Memory Book Absolutely love it and all of the Scripture in... 5 Absolutely love it and all of the Scripture in... 1 0.996329
71 Wall Decor Removable Decal Sticker - Colorful ... Would not purchase again or recommend. The dec... 2 Would not purchase again or recommend The deca... -1 0.040245
91 New Style Trailing Cherry Blossom Tree Decal R... Was so excited to get this product for my baby... 1 Was so excited to get this product for my baby... -1 0.000030

In [36]:
sample_test_data['predictions'] = sample_test_data['predictions_proba'].apply(lambda proba: 1 if proba >= 0.5 else -1)
sample_test_data


C:\Users\Viktor_Pishchulin\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Out[36]:
name review rating review_clean sentiment predictions_proba predictions
59 Our Baby Girl Memory Book Absolutely love it and all of the Scripture in... 5 Absolutely love it and all of the Scripture in... 1 0.996329 1
71 Wall Decor Removable Decal Sticker - Colorful ... Would not purchase again or recommend. The dec... 2 Would not purchase again or recommend The deca... -1 0.040245 -1
91 New Style Trailing Cherry Blossom Tree Decal R... Was so excited to get this product for my baby... 1 Was so excited to get this product for my baby... -1 0.000030 -1

In [37]:
test_predictions = sentiment_model.predict_proba(test_matrix)[:,1]
test_data['predictions_proba'] = test_predictions
test_data['predictions'] = test_data['predictions_proba'].apply(lambda proba: 1 if proba >= 0.5 else -1)


C:\Users\Viktor_Pishchulin\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\Users\Viktor_Pishchulin\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [150]:
test_data = test_data.sort_values('predictions_proba', ascending=False)

In [151]:
test_data[:20]


Out[151]:
name review rating review_clean sentiment predictions_proba predictions majority
97325 Freemie Hands-Free Concealable Breast Pump Col... I absolutely love this product. I work as a C... 5 I absolutely love this product I work as a Cu... 1 1 1 1
168697 Graco FastAction Fold Jogger Click Connect Str... Graco's FastAction Jogging Stroller definitely... 5 Gracos FastAction Jogging Stroller definitely ... 1 1 1 1
147949 Baby Jogger City Mini GT Single Stroller, Shad... Amazing, Love, Love, Love it !!! All 5 STARS a... 5 Amazing Love Love Love it All 5 STARS all the... 1 1 1 1
100166 Infantino Wrap and Tie Baby Carrier, Black Blu... I bought this carrier when my daughter was abo... 5 I bought this carrier when my daughter was abo... 1 1 1 1
140816 Diono RadianRXT Convertible Car Seat, Plum I bought this seat for my tall (38in) and thin... 5 I bought this seat for my tall 38in and thin 2... 1 1 1 1
66059 Evenflo 6 Pack Classic Glass Bottle, 4-Ounce It's always fun to write a review on those pro... 5 Its always fun to write a review on those prod... 1 1 1 1
133651 Britax 2012 B-Agile Stroller, Red [I got this stroller for my daughter prior to ... 4 I got this stroller for my daughter prior to t... 1 1 1 1
87017 Baby Einstein Around The World Discovery Center I am so HAPPY I brought this item for my 7 mon... 5 I am so HAPPY I brought this item for my 7 mon... 1 1 1 1
137034 Graco Pack 'n Play Element Playard - Flint My husband and I assembled this Pack n' Play l... 4 My husband and I assembled this Pack n Play la... 1 1 1 1
168081 Buttons Cloth Diaper Cover - One Size - 8 Colo... We are big Best Bottoms fans here, but I wante... 4 We are big Best Bottoms fans here but I wanted... 1 1 1 1
22586 Britax Decathlon Convertible Car Seat, Tiffany I researched a few different seats to put in o... 4 I researched a few different seats to put in o... 1 1 1 1
114796 Fisher-Price Cradle 'N Swing, My Little Snuga... My husband and I cannot state enough how much ... 5 My husband and I cannot state enough how much ... 1 1 1 1
50315 P'Kolino Silly Soft Seating in Tias, Green I've purchased both the P'Kolino Little Reader... 4 Ive purchased both the PKolino Little Reader C... 1 1 1 1
80155 Simple Wishes Hands-Free Breastpump Bra, Pink,... I just tried this hands free breastpump bra, a... 5 I just tried this hands free breastpump bra an... 1 1 1 1
180646 Mamas & Papas 2014 Urbo2 Stroller - Black After much research I purchased an Urbo2. It's... 4 After much research I purchased an Urbo2 Its e... 1 1 1 1
119182 Roan Rocco Classic Pram Stroller 2-in-1 with B... Great Pram Rocco!!!!!!I bought this pram from ... 5 Great Pram RoccoI bought this pram from Europe... 1 1 1 1
52631 Evenflo X Sport Plus Convenience Stroller - Ch... After seeing this in Parent's Magazine and rea... 5 After seeing this in Parents Magazine and read... 1 1 1 1
165593 Ikea 36 Pcs Kalas Kids Plastic BPA Free Flatwa... For the price this set is unbelievable- and tr... 5 For the price this set is unbelievable and tru... 1 1 1 1
147996 Baby Jogger City Mini GT Double Stroller, Shad... We are well pleased with this stroller, and I ... 4 We are well pleased with this stroller and I w... 1 1 1 1
182089 Summer Infant Wide View Digital Color Video Mo... I love this baby monitor. I can compare this ... 5 I love this baby monitor I can compare this o... 1 1 1 1

In [154]:
test_data.sort_values('predictions_proba', ascending=True)[:20]


Out[154]:
name review rating review_clean sentiment predictions_proba predictions majority
16042 Fisher-Price Ocean Wonders Aquarium Bouncer We have not had ANY luck with Fisher-Price pro... 2 We have not had ANY luck with FisherPrice prod... -1 8.852153e-16 -1 1
120209 Levana Safe N'See Digital Video Baby Monitor w... This is the first review I have ever written o... 1 This is the first review I have ever written o... -1 1.744114e-15 -1 1
77072 Safety 1st Exchangeable Tip 3 in 1 Thermometer I thought it sounded great to have different t... 1 I thought it sounded great to have different t... -1 8.123742e-14 -1 1
48694 Adiri BPA Free Natural Nurser Ultimate Bottle ... I will try to write an objective review of the... 2 I will try to write an objective review of the... -1 1.054774e-13 -1 1
155287 VTech Communications Safe & Sounds Full Co... This is my second video monitoring system, the... 1 This is my second video monitoring system the ... -1 1.686601e-13 -1 1
94560 The First Years True Choice P400 Premium Digit... Note: we never installed batteries in these un... 1 Note we never installed batteries in these uni... -1 4.182346e-13 -1 1
53207 Safety 1st High-Def Digital Monitor We bought this baby monitor to replace a diffe... 1 We bought this baby monitor to replace a diffe... -1 2.950471e-11 -1 1
81332 Cloth Diaper Sprayer--styles may vary I bought this sprayer out of desperation durin... 1 I bought this sprayer out of desperation durin... -1 3.791275e-11 -1 1
113995 Motorola Digital Video Baby Monitor with Room ... DO NOT BUY THIS BABY MONITOR!I purchased this ... 1 DO NOT BUY THIS BABY MONITORI purchased this m... -1 9.778506e-11 -1 1
10677 Philips AVENT Newborn Starter Set It's 3am in the morning and needless to say, t... 1 Its 3am in the morning and needless to say thi... -1 1.052455e-10 -1 1
9915 Cosco Alpha Omega Elite Convertible Car Seat I bought this car seat after both seeing the ... 1 I bought this car seat after both seeing the ... -1 4.421056e-10 -1 1
59546 Ellaroo Mei Tai Baby Carrier - Hershey This is basically an overpriced piece of fabri... 1 This is basically an overpriced piece of fabri... -1 4.446509e-10 -1 1
172090 Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon... I read so many reviews saying the Belkin WiFi ... 2 I read so many reviews saying the Belkin WiFi ... -1 5.988615e-10 -1 1
75994 Peg-Perego Tatamia High Chair, White Latte I can see why there are so many good reviews o... 2 I can see why there are so many good reviews o... -1 6.423164e-10 -1 1
40079 Chicco Cortina KeyFit 30 Travel System in Adve... My wife and I have used this system in two car... 1 My wife and I have used this system in two car... -1 6.621328e-10 -1 1
149987 NUK Cook-n-Blend Baby Food Maker It thought this would be great. I did a lot of... 1 It thought this would be great I did a lot of ... -1 7.253816e-10 -1 1
154878 VTech Communications Safe & Sound Digital ... First, the distance on these are no more than ... 1 First the distance on these are no more than 7... -1 8.797598e-10 -1 1
1116 Safety 1st Deluxe 4-in-1 Bath Station This item is junk. I originally chose it beca... 1 This item is junk I originally chose it becau... -1 1.107440e-09 -1 1
83234 Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs My Experience: Babykicks Inserts failure vs RA... 5 My Experience Babykicks Inserts failure vs RAV... 1 1.620623e-09 -1 1
31741 Regalo My Cot Portable Bed, Royal Blue If I could give this product zero stars I woul... 1 If I could give this product zero stars I woul... -1 1.629788e-09 -1 1

In [153]:
print('Accuracy for test data: %s' % accuracy_score(test_data['sentiment'], test_data['predictions']))


Accuracy for test data: 0.932265418766

In [157]:
test_data.ix[94560]['name']


Out[157]:
'The First Years True Choice P400 Premium Digital Monitor, 2 Parent Unit'

Learn another classifier with fewer words


In [54]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [55]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)

In [142]:
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

Train a logistic regression model on a subset of data


In [59]:
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])


Out[59]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [63]:
simple_model.coef_.flatten()


Out[63]:
array([ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
        1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
       -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
       -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109])

In [104]:
simple_model_coef_table = pd.DataFrame(columns=['word','coef'], index=significant_words)
simple_model_coef_table['word'] = significant_words

In [105]:
simple_model_coef_table['coef'] = simple_model.coef_.flatten()

In [120]:
len(simple_model_coef_table[simple_model_coef_table['coef']>=0])


Out[120]:
10

In [155]:
simple_model_coef_table


Out[155]:
word coef
love love 1.363690
great great 0.944000
easy easy 1.192538
old old 0.085513
little little 0.520186
perfect perfect 1.509812
loves loves 1.673074
well well 0.503760
able able 0.190909
car car 0.058855
broke broke -1.651576
less less -0.209563
even even -0.511380
waste waste -2.033699
disappointed disappointed -2.348298
work work -0.621169
product product -0.320556
money money -0.898031
would would -0.362167
return return -2.109331

In [95]:
full_model_coef_table = pd.DataFrame(columns=['word','coef'], index=vectorizer.vocabulary_)
full_model_coef_table['word'] = vectorizer.vocabulary_
full_model_coef_table['coef'] = sentiment_model.coef_.flatten()
full_model_coef_table = full_model_coef_table.sort_values('coef', ascending=False)

In [122]:
combined_table = simple_model_coef_table.join(full_model_coef_table, on='word', how='left', lsuffix="_simple")

In [123]:
combined_table.sort_values('coef_simple', ascending=False)


Out[123]:
word_simple coef_simple word coef
loves loves 1.673074 loves 0.021214
perfect perfect 1.509812 perfect 0.000403
love love 1.363690 love 0.000027
easy easy 1.192538 easy 0.237364
great great 0.944000 great 0.000129
little little 0.520186 little 0.018753
well well 0.503760 well -0.330530
able able 0.190909 able -0.212967
old old 0.085513 old 0.000006
car car 0.058855 car 0.000005
less less -0.209563 less -0.464299
product product -0.320556 product 0.020904
would would -0.362167 would -0.323012
even even -0.511380 even 0.081014
work work -0.621169 work -0.033034
money money -0.898031 money -0.000002
broke broke -1.651576 broke 0.006126
waste waste -2.033699 waste -0.148080
return return -2.109331 return 0.008149
disappointed disappointed -2.348298 disappointed 0.055364

Compare models


In [129]:
train_data['predictions'] = sentiment_model.predict(train_matrix)


C:\Users\Viktor_Pishchulin\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [141]:
sm_train_accurace = accuracy_score(train_data['sentiment'], sentiment_model.predict(train_matrix))
print('Sentiment model train accuracy: %s' % sm_train_accurace)
simple_train_accuracy = accuracy_score(train_data['sentiment'], simple_model.predict(train_matrix_word_subset))
print('Simple model train accuracy: %s' % simple_train_accuracy)


Sentiment model train accuracy: 0.967964861786
Simple model train accuracy: 0.866822570007

In [143]:
sm_test_accuracy = accuracy_score(test_data['sentiment'], sentiment_model.predict(test_matrix))
print('Sentiment model test accuracy: %s' % sm_test_accuracy)
simple_test_accuracy = accuracy_score(test_data['sentiment'], simple_model.predict(test_matrix_word_subset))
print('Simple model test accuracy: %s' % simple_test_accuracy)


Sentiment model test accuracy: 0.745800335973
Simple model test accuracy: 0.869360451164

In [148]:
test_data['majority'] = 1

In [149]:
accuracy_score(test_data['sentiment'], test_data['majority'])


Out[149]:
0.84278257739380846

In [ ]: